1 package org.apache.lucene.search.spell; 2 3 /* 4 * Licensed to the Apache Software Foundation (ASF) under one or more 5 * contributor license agreements. See the NOTICE file distributed with 6 * this work for additional information regarding copyright ownership. 7 * The ASF licenses this file to You under the Apache License, Version 2.0 8 * (the "License"); you may not use this file except in compliance with 9 * the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, software 14 * distributed under the License is distributed on an "AS IS" BASIS, 15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 import org.apache.lucene.util.IntsRef; 21 22 /** 23 * Damerau-Levenshtein (optimal string alignment) implemented in a consistent 24 * way as Lucene's FuzzyTermsEnum with the transpositions option enabled. 25 * 26 * Notes: 27 * <ul> 28 * <li> This metric treats full unicode codepoints as characters 29 * <li> This metric scales raw edit distances into a floating point score 30 * based upon the shortest of the two terms 31 * <li> Transpositions of two adjacent codepoints are treated as primitive 32 * edits. 33 * <li> Edits are applied in parallel: for example, "ab" and "bca" have 34 * distance 3. 35 * </ul> 36 * 37 * NOTE: this class is not particularly efficient. It is only intended 38 * for merging results from multiple DirectSpellCheckers. 39 */ 40 public final class LuceneLevenshteinDistance implements StringDistance { 41 42 /** 43 * Creates a new comparator, mimicing the behavior of Lucene's internal 44 * edit distance. 45 */ 46 public LuceneLevenshteinDistance() {} 47 48 @Override 49 public float getDistance(String target, String other) { 50 IntsRef targetPoints; 51 IntsRef otherPoints; 52 int n; 53 int d[][]; // cost array 54 55 // NOTE: if we cared, we could 3*m space instead of m*n space, similar to 56 // what LevenshteinDistance does, except cycling thru a ring of three 57 // horizontal cost arrays... but this comparator is never actually used by 58 // DirectSpellChecker, it's only used for merging results from multiple shards 59 // in "distributed spellcheck", and it's inefficient in other ways too... 60 61 // cheaper to do this up front once 62 targetPoints = toIntsRef(target); 63 otherPoints = toIntsRef(other); 64 n = targetPoints.length; 65 final int m = otherPoints.length; 66 d = new int[n+1][m+1]; 67 68 if (n == 0 || m == 0) { 69 if (n == m) { 70 return 0; 71 } 72 else { 73 return Math.max(n, m); 74 } 75 } 76 77 // indexes into strings s and t 78 int i; // iterates through s 79 int j; // iterates through t 80 81 int t_j; // jth character of t 82 83 int cost; // cost 84 85 for (i = 0; i<=n; i++) { 86 d[i][0] = i; 87 } 88 89 for (j = 0; j<=m; j++) { 90 d[0][j] = j; 91 } 92 93 for (j = 1; j<=m; j++) { 94 t_j = otherPoints.ints[j-1]; 95 96 for (i=1; i<=n; i++) { 97 cost = targetPoints.ints[i-1]==t_j ? 0 : 1; 98 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost 99 d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost); 100 // transposition 101 if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { 102 d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); 103 } 104 } 105 } 106 107 return 1.0f - ((float) d[n][m] / Math.min(m, n)); 108 } 109 110 private static IntsRef toIntsRef(String s) { 111 IntsRef ref = new IntsRef(s.length()); // worst case 112 int utf16Len = s.length(); 113 for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { 114 cp = ref.ints[ref.length++] = Character.codePointAt(s, i); 115 } 116 return ref; 117 } 118 }